## Build a 50-state lookup table (abbreviation, name, division, region).
## This table is joined onto most of the data sets created below.
state_abb <- toupper(state.abb)            # 2-letter codes (already upper case; kept defensive)
state_name <- state.name
state_region <- as.character(state.region) # factors -> character for clean joins
state_division <- as.character(state.division)
state_prefix <- data.frame(
  state_abb = state_abb,
  state_name = state_name,
  state_division = state_division,
  state_region = state_region
)
str(state_prefix)
## 'data.frame': 50 obs. of 4 variables:
## $ state_abb : chr "AL" "AK" "AZ" "AR" ...
## $ state_name : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ state_division: chr "East South Central" "Pacific" "Mountain" "West South Central" ...
## $ state_region : chr "South" "West" "West" "South" ...
### Load every per-state salary CSV and stack them into one tibble.
### (Replaces the old grow-in-a-loop union_all pattern, the leftover
### `x <- "Alaska.csv"` test assignment, and the no-op bare print in the loop.)
state_dir <- "Data Pull and Transformation/data/Population and Salary/states/"
state_csv <- list.files(state_dir)
datascientist_salaries <- bind_rows(lapply(state_csv, function(file_name) {
  state_salaries <- read_csv(paste0(state_dir, file_name)) %>%
    mutate(
      # Coerce to character so files whose columns parsed differently
      # still stack cleanly.
      `Salary Estimate` = as.character(`Salary Estimate`),
      Founded = as.character(Founded)
    )
  ### The raw data uses -1 as a missing-value sentinel; we use NA instead.
  state_salaries[state_salaries == -1] <- NA
  state_salaries
}))
### Normalize column names: lower-case everything and replace spaces with
### underscores so columns are easy to reference downstream.
dss_headers <- colnames(datascientist_salaries)
new_col <- gsub(" ", "_", tolower(dss_headers))
colnames(datascientist_salaries) <- new_col
### Split "City, ST" (plus an occasional third piece) out of the location
### column, then bolt the pieces back onto the main frame as city/state/other.
location_name <- c("city", "state", "other")
df_location <- as_tibble(
  str_split(datascientist_salaries$location, ", ", simplify = TRUE)
)
colnames(df_location) <- location_name
datascientist_salaries <- cbind(datascientist_salaries, df_location)
# Cleaning Section
### Later we drop rows that have data in the "other" column and any NA
### salaries -- roughly 100 of 12,537 records, which feels right. We also
### need min/max salary estimates and will take the midpoint in the analysis.
### Company "size" arrives as text like "1001 to 5000 Employees"; split it
### into numeric min/max employee counts. "Unknown"/blank become 0 and the
### open-ended "10000+ Employees" bucket is pinned at 10000.
size_name <- c("min_size", "max_size")
df_size <- as.data.frame(str_split_fixed(datascientist_salaries$size, " to ", 2))
colnames(df_size) <- size_name
df_size <- df_size %>%
  mutate(
    min_size = case_when(
      min_size == "10000+ Employees" ~ "10000",
      min_size == "Unknown" ~ "0",
      min_size == "" ~ "0",
      TRUE ~ min_size
    ),
    max_size = str_remove(max_size, " Employees"),
    # Single-valued sizes leave max_size blank; fall back to min_size.
    max_size = if_else(max_size == "", min_size, max_size),
    min_size = as.numeric(min_size),
    max_size = as.numeric(max_size)
  )
datascientist_salaries <- cbind(datascientist_salaries, df_size)
### Replace any non-printable characters in the free-text fields with spaces.
datascientist_salaries$job_title <-
  str_replace_all(datascientist_salaries$job_title, "[^[:graph:]]", " ")
datascientist_salaries$job_description <-
  str_replace_all(datascientist_salaries$job_description, "[^[:graph:]]", " ")
### Rows whose function/title contain NONE of the data-science keywords --
### the inverse of the keep-filter below, used to eyeball what gets dropped.
### Collapsing the repeated str_detect(x) == FALSE chain into two alternation
### patterns is logically equivalent (negation of the OR of the detections).
drop_function_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat|chain"
drop_title_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat"
df_data_science_drop <- datascientist_salaries %>%
  filter(state != "DC" & !is.na(salary_estimate)) %>%
  left_join(state_prefix, by = c("city" = "state_name")) %>%
  mutate(
    # Blank "state" means the city column held a full state name (joined
    # above); a city name leaking into "state" is patched by hand.
    state_abb = case_when(
      state == "" ~ state_abb,
      state == "Los Angeles" ~ "CA",
      TRUE ~ state
    )
  ) %>%
  filter(
    !is.na(state_abb) &
      # Keep only rows with no data-science keyword anywhere.
      !str_detect(tolower(job_function), drop_function_pattern) &
      !str_detect(job_function, "AI") &
      !str_detect(tolower(job_title), drop_title_pattern) &
      !str_detect(job_title, "AI")
  )
### Keep postings whose function or title contains a data-science keyword.
### The repeated str_detect OR-chain is collapsed into alternation patterns
### (equivalent), and the exact duplicate "director" case_when branches
### (dead code -- first match wins) are removed.
jobs_function_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat|chain"
jobs_title_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat"
df_data_science_jobs <- datascientist_salaries %>%
  left_join(state_prefix, by = c("city" = "state_name")) %>%
  mutate(
    state_abb = case_when(
      state == "" ~ state_abb,
      # Anything that is not a 2-letter code spilled over during the
      # location split; the real abbreviation then sits in "other".
      str_count(state) != 2 ~ other,
      TRUE ~ state
    )
  ) %>%
  filter(
    state != "DC" & !is.na(salary_estimate) & !is.na(state_abb) &
      (
        str_detect(tolower(job_function), jobs_function_pattern) |
          str_detect(job_function, "AI") |
          str_detect(tolower(job_title), jobs_title_pattern) |
          str_detect(job_title, "AI")
      )
  ) %>%
  mutate(
    # Strip parentheses, e.g. "(Glassdoor est.)" -> "Glassdoor est.".
    salary_estimate = gsub("\\(", "", salary_estimate),
    salary_estimate = gsub("\\)", "", salary_estimate),
    min_salary = as.numeric(str_extract(salary_estimate, "\\d+")) * 1000,
    # NOTE(review): "\\K" inside a lookahead is unusual; this appears meant
    # to grab the number before " Glassdoor est." -- confirm against raw data.
    max_salary = as.numeric(str_extract(salary_estimate, "\\d+(?=\\K Glassdoor est.)")) * 1000,
    max_salary = if_else(is.na(max_salary), min_salary, max_salary),
    avg_salary = (max_salary + min_salary) / 2,
    avg_size = (max_size + min_size) / 2,
    # Seniority bucket: first matching rule wins; title rules take
    # precedence over function rules; anything unmatched is "Entry".
    mega_job_title = case_when(
      str_detect(tolower(job_title), "entry") ~ "Entry",
      str_detect(tolower(job_title), "vp") ~ "Senior",
      str_detect(tolower(job_title), "vice president") ~ "Leader",
      str_detect(tolower(job_title), "manager") ~ "Leader",
      str_detect(tolower(job_title), "senior") ~ "Senior",
      str_detect(tolower(job_title), "sr") ~ "Senior",
      str_detect(tolower(job_title), "principal") ~ "Senior",
      str_detect(tolower(job_title), "lead") ~ "Senior",
      str_detect(tolower(job_title), "ii") ~ "Senior",
      str_detect(tolower(job_title), "2") ~ "Senior",
      str_detect(tolower(job_title), "associate") ~ "Senior",
      str_detect(tolower(job_title), "director") ~ "Leader",
      str_detect(tolower(job_function), "entry") ~ "Entry",
      str_detect(tolower(job_function), "vp") ~ "Senior",
      str_detect(tolower(job_function), "vice president") ~ "Leader",
      str_detect(tolower(job_function), "manager") ~ "Leader",
      str_detect(tolower(job_function), "senior") ~ "Senior",
      str_detect(tolower(job_function), "sr") ~ "Senior",
      str_detect(tolower(job_function), "principal") ~ "Senior",
      str_detect(tolower(job_function), "lead") ~ "Senior",
      str_detect(tolower(job_function), "ii") ~ "Senior",
      str_detect(tolower(job_function), "2") ~ "Senior",
      str_detect(tolower(job_function), "associate") ~ "Senior",
      str_detect(tolower(job_function), "director") ~ "Leader",
      TRUE ~ "Entry"
    )
  ) %>%
  select(-other, -founded, -industry, -revenue,
         -type_of_ownership, -size, -salary_estimate, -location, -state,
         -state_division, -state_region) %>%
  left_join(state_prefix, by = c("state_abb" = "state_abb")) %>%
  relocate(job_description, .after = avg_salary) %>%
  relocate(mega_job_title, .after = job_title) %>%
  relocate(c(state_abb, state_name, state_division, state_region), .after = city)
str(df_data_science_jobs)
## 'data.frame': 8269 obs. of 17 variables:
## $ job_title : chr "Data Analyst" "DATA SCIENCE CONSULTANT BIRMINGHAM" "Data Analyst" "Data Scientist" ...
## $ mega_job_title : chr "Entry" "Entry" "Entry" "Entry" ...
## $ job_function : chr "Data Analyst" "Data Consultant" "Data Analyst" "Data Scientist" ...
## $ company_name : chr "DirectPath\n4.1" "managementsolutions\n3.7" "Ascent Health" "Southern Company\n4.3" ...
## $ sector : chr "Insurance" "Business Services" NA "Oil, Gas, Energy & Utilities" ...
## $ city : chr "Birmingham" "Birmingham" "Birmingham" "Birmingham" ...
## $ state_abb : chr "AL" "AL" "AL" "AL" ...
## $ state_name : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ state_division : chr "East South Central" "East South Central" "East South Central" "East South Central" ...
## $ state_region : chr "South" "South" "South" "South" ...
## $ min_size : num 201 1001 0 10000 501 ...
## $ max_size : num 500 5000 0 10000 1000 10000 10000 200 200 200 ...
## $ min_salary : num 38000 52000 43000 63000 37000 78000 51000 37000 59000 44000 ...
## $ max_salary : num 77000 122000 87000 118000 85000 147000 104000 75000 122000 95000 ...
## $ avg_salary : num 57500 87000 65000 90500 61000 ...
## $ job_description: chr "About DirectPath DirectPath guides employees to make better health care decisions with individualized educatio"| __truncated__ ...
## $ avg_size : num 350 3000 0 10000 750 ...
### Household Median Income Pull
### Nine years of median household income (B19013_001, ACS 5-year) by state.
### (Replaces the grow-in-a-loop union_all pattern and the dead `y <- 2019`.)
all_year <- 2011:2019
hhincome_all <- bind_rows(lapply(all_year, function(y) {
  get_acs(geography = "state", year = y, survey = "acs5",
          variables = "B19013_001") %>%
    select(GEOID, NAME, estimate) %>%
    mutate(
      # Year plus the calendar span make each annual pull distinct.
      year_census = y,
      date_census_start = as_date(paste0(y, "-01-01")),
      date_census_end = as_date(paste0(y, "-12-31")),
      state_name = NAME,
      mhhi = estimate
    ) %>%
    select(-GEOID, -estimate, -NAME)
}))
### Population Pull
### Nine years of total state population (B01003_001, ACS 5-year).
### (Replaces the grow-in-a-loop union_all pattern and the dead `y <- 2019`.)
population_all <- bind_rows(lapply(all_year, function(y) {
  get_acs(geography = "state", year = y, survey = "acs5",
          variables = "B01003_001") %>%
    select(GEOID, NAME, estimate) %>%
    mutate(
      # Year plus the calendar span make each annual pull distinct.
      year_census = y,
      date_census_start = as_date(paste0(y, "-01-01")),
      date_census_end = as_date(paste0(y, "-12-31")),
      state_name = NAME,
      population = estimate
    ) %>%
    select(-GEOID, -estimate, -NAME)
}))
### Combine median household income and population into one state-year table.
### (Drops the redundant `if_else(cond, TRUE, FALSE)` and the redundant
### intermediate `select(-mhhi)` -- the final select already excludes mhhi.)
df_census_data <- population_all %>%
  left_join(hhincome_all, by = c("year_census" = "year_census",
                                 "date_census_start" = "date_census_start",
                                 "date_census_end" = "date_census_end",
                                 "state_name" = "state_name")) %>%
  mutate(
    # Flag the most recent year present in the pull.
    census_current_flag = year_census == max(year_census),
    median_household_income = if_else(is.na(mhhi), 0, mhhi)
  ) %>%
  left_join(state_prefix, by = c("state_name" = "state_name")) %>%
  mutate(
    # DC is absent from the 50-state lookup; patch its fields by hand.
    state_abb = if_else(is.na(state_abb), "DC", state_abb),
    state_region = if_else(is.na(state_region), "Northeast", state_region),
    state_division = if_else(is.na(state_division), "South Atlantic", state_division)
  ) %>%
  select(year_census, date_census_start, date_census_end, state_abb, state_name,
         state_region, state_division, population, median_household_income,
         census_current_flag)
str(df_census_data)
## tibble [468 × 10] (S3: tbl_df/tbl/data.frame)
## $ year_census : int [1:468] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ date_census_start : Date[1:468], format: "2011-01-01" "2011-01-01" ...
## $ date_census_end : Date[1:468], format: "2011-12-31" "2011-12-31" ...
## $ state_abb : chr [1:468] "AL" "AK" "AZ" "AR" ...
## $ state_name : chr [1:468] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ state_region : chr [1:468] "South" "West" "West" "South" ...
## $ state_division : chr [1:468] "East South Central" "Pacific" "Mountain" "West South Central" ...
## $ population : num [1:468] 4747424 700703 6337373 2895928 36969200 ...
## $ median_household_income: num [1:468] 42934 69014 50752 40149 61632 ...
## $ census_current_flag : logi [1:468] FALSE FALSE FALSE FALSE FALSE FALSE ...
# https://taxfoundation.org/publications/state-individual-income-tax-rates-and-brackets/
# https://taxfoundation.org/publications/federal-tax-rates-and-tax-brackets/
df_state_tax_long <- read.csv("Data Pull and Transformation/data/Tax/state_income_tax.csv") %>%
  mutate(
    single_brackets = as.numeric(single_brackets),
    married_brackets = as.numeric(married_brackets)
  )
df_federal_income_tax <- read.csv("Data Pull and Transformation/data/Tax/federal_income_tax.csv")
### Widen the bracket table: within each state, a row's upper bound is the
### next row's lower bound; the top bracket is capped at 1,000,000.
### coalesce(lead(x), k) replaces if_else(!is.na(lead(x)), lead(x), k),
### which computed lead() twice.
df_state_tax_wide <- df_state_tax_long %>%
  filter(state_abb %notin% c('AK','HI')) %>%
  group_by(state_abb) %>%
  mutate(
    max_single_bracket = coalesce(lead(single_brackets, 1), 1000000),
    max_married_bracket = coalesce(lead(married_brackets, 1), 1000000),
    min_single_bracket = single_brackets,
    min_married_bracket = married_brackets
  ) %>%
  ungroup() %>%
  left_join(state_prefix) %>%
  select(state_abb, state_name, single_rates, min_single_bracket,
         max_single_bracket, min_married_bracket, max_married_bracket)
str(df_state_tax_wide)
## tibble [190 × 7] (S3: tbl_df/tbl/data.frame)
## $ state_abb : chr [1:190] "AL" "AL" "AL" "AZ" ...
## $ state_name : chr [1:190] "Alabama" "Alabama" "Alabama" "Arizona" ...
## $ single_rates : num [1:190] 0.02 0.04 0.05 0.03 0.03 0.04 0.05 0.08 0.02 0.04 ...
## $ min_single_bracket : num [1:190] 0 500 3000 0 27272 ...
## $ max_single_bracket : num [1:190] 500 3000 1000000 27272 54544 ...
## $ min_married_bracket: num [1:190] 0 1000 6000 0 54544 ...
## $ max_married_bracket: num [1:190] 1000 6000 1000000 54544 109088 ...
str(df_federal_income_tax)
## 'data.frame': 7 obs. of 5 variables:
## $ rates : num 0.1 0.12 0.22 0.24 0.32 0.35 0.37
## $ min_single_bracket : int 0 9951 40526 86376 164926 209426 523601
## $ max_single_brackets : int 9950 40525 86375 164925 209425 523600 100000
## $ min_married_brackets: int 0 19902 81052 172752 329852 418852 628301
## $ max_married_brackets: int 19900 81050 172750 329850 418850 628300 100000
### Zillow home-value pulls (1-bedroom, middle tier), city and state level.
### The implied payment uses a fixed-rate annuity factor; the duplicated
### magic-number formula is hoisted into named constants.
### NOTE(review): 0.025 reads as a per-period rate over 360 periods -- confirm
### the intended APR/compounding.
payment_rate <- 0.025
payment_periods <- 360
annuity_factor <- (payment_rate * (1 + payment_rate)^payment_periods) /
  ((1 + payment_rate)^payment_periods - 1)
df_zillow_city <- read_csv("Data Pull and Transformation/data/City/City_zhvi_bdrmcnt_1_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv")[,-9:-248] %>%
  pivot_longer(
    cols = contains("-"),
    names_to = "date",
    values_to = "house_price"
  ) %>%
  mutate(
    rent_cost = house_price * annuity_factor,
    city = RegionName
  ) %>%
  select(city, State, StateName, date, Metro, CountyName, house_price, rent_cost)
df_zillow_state <- read_csv("Data Pull and Transformation/data/State/State_zhvi_bdrmcnt_1_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv")[,-6:-245] %>%
  pivot_longer(
    cols = contains("-"),
    names_to = "date",
    values_to = "house_price"
  ) %>%
  mutate(
    rent_cost = house_price * annuity_factor,
    state_name = RegionName,
    state_abb = StateName
  ) %>%
  left_join(state_prefix) %>%
  select(state_abb, state_name, state_division, state_region, date, house_price, rent_cost)
### import Data
### Import the Zillow ZHVI zip-level files (1-5 bedrooms). Each file gets the
### same treatment -- keep the id/location columns plus the last 51 months,
### tag the bedroom count, and standardize the zip codes -- so the five
### copy-pasted pipelines are replaced with a single helper.
read_bedroom_file <- function(n_bedrooms) {
  path <- paste0(
    "Data Pull and Transformation/data/Zip/Zip_zhvi_bdrmcnt_", n_bedrooms,
    "_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv"
  )
  ### Clean Columns: ids, location fields, and the monthly value columns.
  homes <- read_csv(path)[, c(1:3, 6:9, 262:312)]
  ### Add Bedroom count, reorder columns.
  homes <- homes %>%
    mutate(bedrooms = n_bedrooms) %>%
    rename(Zip = RegionName) %>%
    select(c(1:7, bedrooms, 8:58))
  homes$Zip <- clean.zipcodes(homes$Zip)
  homes
}
oneBedroom <- read_bedroom_file(1)
twoBedroom <- read_bedroom_file(2)
threeBedroom <- read_bedroom_file(3)
fourBedroom <- read_bedroom_file(4)
fiveBedroom <- read_bedroom_file(5)
### Bind to one data frame, melt dates
### Stack every bedroom count, drop AK/HI/DC, then melt the month columns
### into (dates, house_expense) rows.
allhomes <- rbind(oneBedroom, twoBedroom, threeBedroom, fourBedroom, fiveBedroom) %>%
  arrange(State, Zip, bedrooms)
allhomes <- subset(allhomes, !(allhomes$State == 'HI' | allhomes$State == 'AK' | allhomes$State == 'DC'))
length(unique(allhomes$State))
## [1] 48
allhomesClean <- reshape2::melt(allhomes, id = c(1:8)) %>%
  rename(house_expense = value, dates = variable)
### add zip code locations
data("zipcode")
# Merge on zip to pick up lat/long, then attach the state lookup columns.
allhomesLocation <- merge(allhomesClean, zipcode, by.x = 'Zip', by.y = 'zip')
allhomesLocation <- merge(x = allhomesLocation, y = state_prefix,
                          by.x = "state", by.y = "state_abb", all.x = TRUE)
str(allhomesLocation)
## 'data.frame': 6219399 obs. of 17 variables:
## $ state : chr "AL" "AL" "AL" "AL" ...
## $ Zip : chr "35091" "35091" "35091" "35091" ...
## $ RegionID : num 73316 73316 73316 73316 73316 ...
## $ SizeRank : num 16685 16685 16685 16685 16685 ...
## $ State : chr "AL" "AL" "AL" "AL" ...
## $ City : chr "Kimberly" "Kimberly" "Kimberly" "Kimberly" ...
## $ Metro : chr "Birmingham-Hoover" "Birmingham-Hoover" "Birmingham-Hoover" "Birmingham-Hoover" ...
## $ CountyName : chr "Jefferson County" "Jefferson County" "Jefferson County" "Jefferson County" ...
## $ bedrooms : num 2 2 4 4 5 3 5 3 2 4 ...
## $ dates : Factor w/ 51 levels "2017-01-31","2017-02-28",..: 17 9 1 38 46 38 20 39 13 5 ...
## $ house_expense : num 91319 86524 235258 269600 313362 ...
## $ city : chr "Kimberly" "Kimberly" "Kimberly" "Kimberly" ...
## $ latitude : num 33.8 33.8 33.8 33.8 33.8 ...
## $ longitude : num -86.8 -86.8 -86.8 -86.8 -86.8 ...
## $ state_name : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ state_division: chr "East South Central" "East South Central" "East South Central" "East South Central" ...
## $ state_region : chr "South" "South" "South" "South" ...
### Salary spread by state for the West and Northeast regions.
df_data_science_jobs %>%
  filter(state_region %in% c("West", "Northeast")) %>%
  ggplot(aes(x = state_abb, y = avg_salary)) +
  geom_boxplot(color = "red", fill = "white") +
  stat_smooth(method = "lm") +
  labs(title = "Hot Spot States within US", x = "States", y = "Avg Salary") +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  theme(text = element_text(size = 20))
### Salary spread compared across all four US regions.
df_data_science_jobs %>%
  filter(state_region %in% c("Northeast", "West", "South", "North Central")) %>%
  ggplot(aes(x = state_region, y = avg_salary)) +
  geom_boxplot(color = "red", fill = "white") +
  stat_smooth(method = "lm") +
  labs(title = "Salary Comparison by Regions", x = "US Regions", y = "Avg Salary") +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  theme(text = element_text(size = 20))
#now lets compare specific job titles and average salaries
#which one is considered better or more frequent?
### One helper replaces four copy-pasted histogram pipelines; also fixes the
### "Scientest" -> "Scientist" typo in the chart titles. print() is required
### because plots inside a function body do not auto-print.
plot_salary_hist <- function(levels, title, bins) {
  # Histogram of avg_salary (< $200k) for the given seniority bucket(s).
  p <- df_data_science_jobs %>%
    filter(mega_job_title %in% levels & avg_salary < 200000) %>%
    ggplot() +
    aes(x = avg_salary) +
    geom_histogram(bins = bins, color = "red", fill = "white") +
    scale_x_continuous(labels = scales::comma) +
    labs(title = title,
         x = "Avg Salary",
         y = "Salary Count") +
    theme_minimal() +
    theme(text = element_text(size = 20))
  print(p)
}
plot_salary_hist(c("Entry", "Senior", "Leader"), "Data Scientist", bins = 15)
plot_salary_hist("Entry", "Entry Level Data Scientist", bins = 20)
plot_salary_hist("Senior", "Senior Level Data Scientist", bins = 20)
plot_salary_hist("Leader", "Leader Level Data Scientist", bins = 20)
### Current-year census rows for the lower 48 (state_name lower-cased to
### match map_data regions). Replaces `census_current_flag == TRUE` and the
### mixed &/comma filter with a single %in% exclusion.
census_data <- df_census_data %>%
  filter(census_current_flag, !state_abb %in% c("AK", "HI", "DC")) %>%
  mutate(state_name = tolower(state_name))
#now make an analysis from the historical data from past 9 years
### Population by state, largest first. The original applied axis.text.x
### twice with conflicting angles (80 then 90); since ggplot merges element
### properties, the effective result was angle 90 with vjust/hjust from the
### first call -- consolidated here into a single theme() call.
census_data %>%
  ggplot(aes(reorder(state_abb, desc(population)), population)) +
  geom_col(color = "red", fill = "white") +
  labs(x = "State Name", y = "Population", title = "State Population") +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 20)
  )
# fig.height=5, fig.width=10
### Interactive scatter: state population vs. median household income.
ggplotly(
  ggplot(census_data) +
    aes(x = population, y = median_household_income) +
    geom_point(shape = "bullet", size = 3.9, colour = "#B22222") +
    scale_y_continuous(labels = scales::comma) +
    labs(title = "Population vs. House Income",
         x = "State Population",
         y = "Median Income") +
    theme_minimal() +
    theme(text = element_text(size = 20))
)
# Federal Income Tax
### Structural sanity check on the tax tables loaded earlier.
### NOTE(review): the final max_single_brackets / max_married_brackets value
### of 100000 is smaller than the bracket's own minimum -- looks truncated in
### the source CSV (the state-level wide table caps at 1,000,000); verify.
str(df_federal_income_tax)
## 'data.frame': 7 obs. of 5 variables:
## $ rates : num 0.1 0.12 0.22 0.24 0.32 0.35 0.37
## $ min_single_bracket : int 0 9951 40526 86376 164926 209426 523601
## $ max_single_brackets : int 9950 40525 86375 164925 209425 523600 100000
## $ min_married_brackets: int 0 19902 81052 172752 329852 418852 628301
## $ max_married_brackets: int 19900 81050 172750 329850 418850 628300 100000
# State Income Tax
### Long format: one row per (state, bracket) with rates and lower bounds.
str(df_state_tax_long)
## 'data.frame': 203 obs. of 5 variables:
## $ state_abb : chr "AL" "AL" "AL" "AK" ...
## $ single_rates : num 0.02 0.04 0.05 0 0.03 0.03 0.04 0.05 0.08 0.02 ...
## $ single_brackets : num 0 500 3000 0 0 ...
## $ married_rates : num 0.02 0.04 0.05 0 0.03 0.03 0.04 0.05 0.08 0.02 ...
## $ married_brackets: num 0 1000 6000 0 0 ...
### Create filtered data sets and maps
### 2020 average 1-bedroom house expense per zip (for the point maps below)
### and per state.
### NOTE(review): `dates` is a factor produced by reshape2::melt, so
### between() here compares against one-level factors rather than calendar
### dates -- confirm this really selects the 2020 months; converting with
### as.Date(as.character(dates)) would be unambiguous.
filtered <- allhomesLocation %>%
filter(bedrooms == 1 & between(dates, as.factor('2020-01-31'), as.factor('2020-12-31')) &
!is.na(house_expense) & house_expense < 1000000) %>%
# Zip-level mean; >= $1M homes excluded to keep the map color scale readable.
group_by(Zip, state_name, latitude, longitude) %>%
summarise(house_expense = mean(house_expense))
filteredState <- allhomesLocation %>%
filter(bedrooms == 1 & between(dates, as.factor('2020-01-31'), as.factor('2020-12-31')) &
!is.na(house_expense)) %>%
# State-level mean keeps all price points (no $1M cap).
group_by(state_name) %>%
summarise(house_expense = mean(house_expense))
### Base map polygons; region names title-cased to match state_name values.
us <- map_data("state") %>% mutate(region = str_to_title(region))
#US Map
### Nationwide: one point per zip, colored by average house expense.
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") +
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() +
  scale_fill_gradient(low = "beige", high = "blue") +
  theme_map() +
  labs(title = "2020 Average House Expense by Zip", color = "House Expense") +
  theme(plot.title = element_text(size = 16))
#NE Map
### Same zip-level map, windowed to the Northeast.
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") +
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() +
  scale_fill_gradient(low = "beige", high = "blue") +
  theme_map() +
  xlim(-80, -67) +
  ylim(37.5, 47.5) +
  labs(title = "2020 Average House Expense by Zip - NE", color = "House Expense") +
  theme(plot.title = element_text(size = 16))
#California Map
### Same zip-level map, windowed to California.
### Fix: theme_map() was applied AFTER theme(plot.title = ...), and as a
### complete theme it wiped the title-size tweak -- inconsistent with the US
### and NE maps above. Reordered so the title tweak survives.
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") +
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() +
  xlim(-125, -115) +
  ylim(33, 42.5) +
  scale_fill_gradient(low = "beige", high = "blue") +
  ggtitle("2020 Average House Expense by Zip - California") +
  labs(color = "House Expense") +
  theme_map() +
  theme(plot.title = element_text(size = 16))
### Pull the raw job descriptions out for text mining; peek at the first one.
job_description <- df_data_science_jobs[["job_description"]]
head(job_description, 1)
## [1] "About DirectPath DirectPath guides employees to make better health care decisions with individualized education for selecting the right benefit plan, expert assistance in making informed care choices and rewards for sensible financial decisions. Position Summary The Data Analyst functions as a key player to ensure data accuracy for client facing reports & internal applications. The role will coordinate with Client Relationship Managers and Solutions Architect to develop and fix carrier, payroll, and other ad hoc reports based on client/carrier requirements using bSwift and SQL Server Management Studio (SSMS). The Data Analyst can be located in Birmingham, AL or Milwaukee, WI. Relocation assistance, for this position, is not available. As an industry leader in the employee engagement space, DirectPath offers our full-time regular employees a comprehensive and competitive compensation and employee benefits package. With our contribution toward the monthly premium, employees and their families have the choice of a generous group PPO medical plan or a HDHP with a health savings account option. We also provide company paid long term disability, short term disability, basic life insurance and a critical illness insurance policy. Additional benefits an employee may choose are dental, vision, additional life insurance, accident insurance, additional critical illness insurance, pet insurance and flexible spending accounts. Retirement planning is important too. Employees are eligible to participate in the 401(k) on the first day of employment and eligible for the company match after 6 months. Employees also enjoy a flexible time off plan that allows a new hire up to 15 days plus 9 holidays in the first year! (prorated based on date of hire, calendar year)"
### Choropleth: single-filer marginal rate for the bracket containing $100k.
df_state_tax_wide %>%
  filter(max_single_bracket >= 100000 & min_single_bracket <= 100000) %>%
  ggplot(aes(map_id = state_name)) +
  geom_map(map = us, aes(fill = single_rates)) +
  expand_limits(x = us$long, y = us$lat) +
  coord_map() +
  scale_fill_gradient(labels = percent) +
  labs(title = "Single Tax Rate", fill = "Tax Rate", x = "", y = "") +
  theme_map() +
  theme(plot.title = element_text(size = 22))
### Married-filer map for the bracket containing $100k of income.
### NOTE(review): fill uses single_rates because df_state_tax_wide carries no
### married_rates column (it was dropped when widening) -- confirm single and
### married rates match per bracket, otherwise this map is mislabeled.
df_state_tax_wide %>% filter(max_married_bracket >= 100000 & min_married_bracket <= 100000) %>%
ggplot(aes(map_id = state_name)) +
geom_map(map = us, aes(fill = single_rates)) +
expand_limits(x = us$long, y = us$lat) +
scale_fill_gradient(labels = percent) +
labs(fill = "Tax Rate") +
ylab("") +
xlab("") +
coord_map() +
ggtitle("Married Tax Rate") +
theme_map() +
theme(plot.title = element_text(size=22))
#looking back at data scientists salaries
### Frequency polygons of max and min salary, one line per region.
df_data_science_jobs %>%
  ggplot(aes(x = max_salary)) +
  geom_freqpoly(aes(colour = state_region), binwidth = 500) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Maximum Salary Analysis",
    x = "Max Salary",
    y = "Max Salary Count",
    color = "State Region"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 20))
df_data_science_jobs %>%
  ggplot(aes(x = min_salary)) +
  geom_freqpoly(aes(colour = state_region), binwidth = 500) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Minimum Salary Analysis",
    x = "Min Salary",
    y = "Min Salary Count",
    color = "State Region"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 20))
#lets compare to a texas! still high population but more spread out rural land
### Scaled density of average salary, TX vs. CA.
state_data <- df_data_science_jobs %>%
  filter(state_abb %in% c("TX", "CA")) %>%
  select(state_abb, min_salary, max_salary, avg_salary)
ggplot(state_data) +
  aes(x = avg_salary, y = ..scaled.., colour = state_abb) +
  geom_density(adjust = 1L, fill = NA) +
  scale_color_hue(direction = 1) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Texas vs. California Salary Comparison",
    x = "Avg Salary",
    y = "Scaled",
    color = "State Abbreviation"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 20))
### Lower-case the descriptions for text mining.
### Fix: the second line of the knitted tail() output below had lost its
### "## " prefix, leaving bare prose in the script -- a parse error.
description <- tolower(df_data_science_jobs$job_description)
#sylFile <- "txt_files/ist_687_syllabus.txt"
# input <- readLines(description)
tail(description, 1)
## [1] "bachelor’s degree in a highly quantitative field (computer science, machine learning, operational research, statistics, mathematics, etc.) or equivalent professional or military experience experience with ml fields, e.g., natural language processing, computer vision, statistical learning theory 6+ years of industry experience in predictive modeling, data science, and analysis experience in an ml engineer or data scientist role building and deploying ml models or hands on experience developing deep learning models experience writing code in python, r, scala, java, c++ with documentation for reproducibility experience handling terabyte size datasets, diving into data to discover hidden patterns, using data visualization tools, writing sql, and working with gpus to develop models want to help the largest global enterprises derive business value through the adoption of artificial intelligence (ai) and machine learning (ml)? excited by using massive amounts of disparate data to develop ml models? eager to learn to apply ml to a diverse array of enterprise use cases? thrilled to be a part of amazon who has been pioneering and shaping the world’s ai/ml technology for decades? at amazon web services (aws), we are helping large enterprises build ml models on the aws cloud. we are applying predictive technology to large volumes of data and against a wide spectrum of problems. aws professional services works together with aws customers to address their business needs using ai solutions. aws professional services is a unique consulting team. we pride ourselves on being customer obsessed and highly focused on the ai enablement of our customers. if you have experience with ai, including building ml models, we’d like to have you join our team. you will get to work with an innovative company, with great teammates, and have a lot of fun helping our customers. 
## a successful candidate will be a person who enjoys diving deep into data, doing analysis, discovering root causes, and designing long-term solutions. this is a customer-facing role and you will be required to travel to client locations and deliver professional services as needed. amazon is committed to a diverse and inclusive workplace. amazon is an equal opportunity employer and does not discriminate on the basis of race, national origin, gender, gender identity, sexual orientation, protected veteran status, disability, age, or other legally protected status. for individuals with disabilities who would like to request an accommodation, please visit https://www.amazon.jobs/en/disability/us. pursuant to the san francisco fair chance ordinance, we will consider for employment qualified applicants with arrest and conviction records. pursuant to the los angeles fair chance ordinance, we will consider for employment qualified applicants with arrest and conviction records. for employees based in colorado, this position starts at $116,200 per year. a sign-on bonus and restricted stock units may be provided as part of the compensation package, in addition to a range of medical, financial, and/or other benefits, dependent on the position offered."
str(description)
## chr [1:8269] "about directpath directpath guides employees to make better health care decisions with individualized educatio"| __truncated__ ...
### Word cloud
#use TM package
#create the word list with counts
words.vec <- VectorSource(description)
words.corpus <- Corpus(words.vec)
words.corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 8269
tdm <- TermDocumentMatrix(words.corpus)
str(tdm)
## List of 6
## $ i : int [1:927689] 1 2 3 4 5 6 7 8 9 10 ...
## $ j : int [1:927689] 1 1 1 1 1 1 1 1 1 1 ...
## $ v : num [1:927689] 1 1 1 1 1 1 1 1 3 1 ...
## $ nrow : int 43461
## $ ncol : int 8269
## $ dimnames:List of 2
## ..$ Terms: chr [1:43461] "(prorated" "(ssms)." "401(k)" "about" ...
## ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
## - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
#explore what we created
# NOTE(review): this str(tdm) repeats the identical call a few lines above;
# kept as-is since the knitted output below documents the matrix shape.
str(tdm)
## List of 6
## $ i : int [1:927689] 1 2 3 4 5 6 7 8 9 10 ...
## $ j : int [1:927689] 1 1 1 1 1 1 1 1 1 1 ...
## $ v : num [1:927689] 1 1 1 1 1 1 1 1 3 1 ...
## $ nrow : int 43461
## $ ncol : int 8269
## $ dimnames:List of 2
## ..$ Terms: chr [1:43461] "(prorated" "(ssms)." "401(k)" "about" ...
## ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
## - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Peek at raw term frequencies for the first 20 terms x first 20 documents.
inspect(tdm[1:20,1:20])
## <<TermDocumentMatrix (terms: 20, documents: 20)>>
## Non-/sparse entries: 63/337
## Sparsity : 84%
## Maximal term length: 13
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 11 13 15 18 4 5 6 7 9
## (prorated 1 0 0 0 0 0 0 0 0 0
## (ssms). 1 0 0 0 0 0 0 0 0 0
## 401(k) 1 0 0 0 0 0 0 0 0 0
## about 1 0 0 0 0 0 1 4 4 0
## accuracy 1 0 0 0 0 0 0 1 1 0
## additional 3 0 0 0 0 0 0 0 0 0
## also 2 1 0 0 1 0 0 3 2 0
## analyst 2 2 0 0 1 0 0 0 0 0
## and 11 9 9 16 15 14 9 29 29 8
## are 2 0 2 0 0 0 0 2 2 0
# Convert the sparse term-document matrix to a dense matrix and total each
# term's count across all 8,269 descriptions.
# NOTE(review): as.matrix() on a 43k x 8k tdm allocates a large dense matrix;
# fine at this scale but worth watching if the job data grows.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
head(wordCounts)
## (prorated (ssms). 401(k) about accident account
## 1 1 59 2482 11 152
# Rank terms by frequency -- dominated by stopwords before cleaning.
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)
## and the data with for our
## 85747 50189 27548 18253 18151 17135
# wordcloud(names(wordCounts), wordCounts)
# now remove stuff we don't care about
# Clean the corpus: lower-case, strip punctuation/numbers, drop English
# stopwords, then drop domain words so common they would drown out the cloud.
words.corpus <- words.corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(removeWords, c("data","will","analytics","learning","analysis","position","role","job","new","work"))
# recreate the wordcount
# Rebuild the term-document matrix from the cleaned corpus.
tdm <- TermDocumentMatrix(words.corpus)
tdm
## <<TermDocumentMatrix (terms: 24842, documents: 8269)>>
## Non-/sparse entries: 736896/204681602
## Sparsity : 100%
## Maximal term length: 146
## Weighting : term frequency (tf)
# Summary of our cleaned data: recompute and rank term totals, top 20 shown.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts, 20)
## business team experience science solutions support
## 8901 7943 7791 5019 4582 3930
## development technology company services machine management
## 3613 3600 3490 3318 3236 3057
## help information customers software develop people
## 3013 2992 2787 2766 2760 2743
## across analyst
## 2742 2720
# Create word cloud from the cleaned description term counts.
# (The original also built `cloudFrame <- data.frame(word, freq)` here, but it
# was never read before being rebuilt for the title cloud below -- wordcloud()
# takes the name/count vectors directly -- so the dead assignment is removed.)
wordcloud(names(wordCounts), wordCounts, min.freq = 5, max.words = 50, rot.per = 0.35,
colors = brewer.pal(8, "Dark2"))
# Repoint `description` at the job titles for a second word cloud
# (the descriptions cloud above reused the same variable name).
description <- df_data_science_jobs$job_title
# Leftover scaffolding from an earlier exercise; kept commented out.
#sylFile <- "txt_files/ist_687_syllabus.txt"
# input <- readLines(description)
tail(description, 20)
## [1] "Senior Associate, ServiceNow Consulting"
## [2] "Senior Associate, Information Security Engineer - Managed Security Services"
## [3] "Senior Business Analyst/Product Manager, Data & Analytics – Clinical/Research"
## [4] "Sr. Consumer Data Analyst (Remote or in-office)"
## [5] "Senior Data Engineer"
## [6] "SENIOR BUSINESS ANALYST/PRODUCT MANAGER, DATA AND ANALYTICS PLATFORM"
## [7] "SENIOR BUSINESS ANALYST/PRODUCT MANAGER, DATA & ANALYTICS – CLINICAL/RESEARCH"
## [8] "Data Architect Administrator (Remote)"
## [9] "Sr. Cloud Software Engineer - AI Engineering - Remote"
## [10] "Senior Financial Data Analyst"
## [11] "Senior Associate, Cloud DevOps Engineer"
## [12] "Entegral Data Enterprise Engineer/Architect (Remote)"
## [13] "Senior Data Analyst"
## [14] "Senior Machine Learning Engineer"
## [15] "Manager, Full Stack Developer"
## [16] "Senior Machine Learning Engineer"
## [17] "Senior Statistician"
## [18] "Senior Machine Learning Engineer"
## [19] "Statistician Technician"
## [20] "Senior Data Scientist - ProServe"
# Structure check: 8,269 raw job titles.
str(description)
## chr [1:8269] "Data Analyst" "DATA SCIENCE CONSULTANT BIRMINGHAM" ...
### Word cloud
#use TM package
#create the word list with counts
# Same pipeline as the descriptions: source -> corpus -> term-document matrix,
# one document per job title.
words.vec <- VectorSource(description)
words.corpus <- Corpus(words.vec)
words.corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 8269
tdm <- TermDocumentMatrix(words.corpus)
str(tdm)
## List of 6
## $ i : int [1:30953] 1 2 3 4 2 5 1 2 2 6 ...
## $ j : int [1:30953] 1 1 2 2 2 2 3 3 4 4 ...
## $ v : num [1:30953] 1 1 1 1 1 1 1 1 1 1 ...
## $ nrow : int 2663
## $ ncol : int 8269
## $ dimnames:List of 2
## ..$ Terms: chr [1:2663] "analyst" "data" "birmingham" "consultant" ...
## ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
## - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
#explore what we created
# NOTE(review): duplicate of the str(tdm) call just above; kept for the
# recorded output.
str(tdm)
## List of 6
## $ i : int [1:30953] 1 2 3 4 2 5 1 2 2 6 ...
## $ j : int [1:30953] 1 1 2 2 2 2 3 3 4 4 ...
## $ v : num [1:30953] 1 1 1 1 1 1 1 1 1 1 ...
## $ nrow : int 2663
## $ ncol : int 8269
## $ dimnames:List of 2
## ..$ Terms: chr [1:2663] "analyst" "data" "birmingham" "consultant" ...
## ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
## - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Raw counts for the first 20 title terms across the first 20 titles.
inspect(tdm[1:20,1:20])
## <<TermDocumentMatrix (terms: 20, documents: 20)>>
## Non-/sparse entries: 51/349
## Sparsity : 87%
## Maximal term length: 17
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 10 14 16 17 2 3 4 5 9
## analyst 1 1 1 0 0 0 1 0 0 0
## assurance 0 0 0 0 0 0 0 0 1 0
## birmingham 0 0 0 0 0 1 0 0 0 0
## consultant 0 0 0 0 0 1 0 0 0 0
## data 1 1 1 1 1 1 1 1 0 0
## engineer 0 0 0 1 1 0 0 0 0 1
## junior 0 0 0 1 0 0 0 0 0 1
## science 0 0 0 0 1 1 0 0 0 0
## scientist 0 0 0 0 0 0 0 1 0 0
## software 0 0 0 0 0 0 0 0 1 1
# Dense matrix of title-term counts; total each term across all titles.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
head(wordCounts)
## analyst data birmingham consultant science scientist
## 2049 6199 3 162 548 2229
# Rank title terms by frequency before cleaning.
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)
## data scientist analyst engineer senior analytics
## 6199 2229 2049 1706 1058 611
# wordcloud(names(wordCounts), wordCounts)
# now remove stuff we don't care about
# Clean the title corpus: lower-case, strip punctuation/numbers, drop English
# stopwords, and drop the ubiquitous data-science terms that would dominate.
words.corpus <- words.corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(removeWords, c("data","engineer","scientist","science","analyst","analytics","business","engineering","software"))
# recreate the wordcount
tdm <- TermDocumentMatrix(words.corpus)
tdm
## <<TermDocumentMatrix (terms: 1976, documents: 8269)>>
## Non-/sparse entries: 15187/16324357
## Sparsity : 100%
## Maximal term length: 33
## Weighting : term frequency (tf)
# Summary of our cleaned title data: recompute and rank term totals.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts, 20)
## senior learning manager machine remote intelligence
## 1062 465 441 427 288 213
## lead associate developer research director consultant
## 211 210 204 184 181 175
## marketing architect specialist operations product –
## 173 151 138 131 120 113
## principal systems
## 108 105
# Create word cloud
# cloudFrame pairs each term with its count; the wordcloud() call below takes
# the vectors directly, so cloudFrame is presumably for later use -- verify.
cloudFrame <- data.frame(word=names(wordCounts), freq = wordCounts)
wordcloud(names(wordCounts), wordCounts, min.freq = 3, max.words = 50, rot.per = 0.35,
colors = brewer.pal(8, "Dark2"))
# Average 2020 rent cost per state (DC excluded to match the salary data).
# `.groups = "drop"` already returns an ungrouped tibble, so the trailing
# ungroup() in the original was redundant and has been removed.
df_zillow_avg_state <- df_zillow_state %>%
  filter(
    between(as.Date(date), as.Date("2020-01-31"), as.Date("2020-12-31")),
    state_abb != "DC"
  ) %>%
  group_by(state_abb, state_name, state_division, state_region) %>%
  summarise(
    rent_cost = mean(rent_cost),
    .groups = "drop"
  )
### Bring the census and rent data into the jobs data to build the model frame.
### The join keys are now spelled out explicitly -- they are exactly the keys
### dplyr reported ("Joining, by = ...") when the joins were implicit -- so a
### future schema change fails loudly instead of silently joining differently.
state_keys <- c("state_abb", "state_name", "state_division", "state_region")
model_data_set <- df_data_science_jobs %>%
  left_join(df_census_data, by = state_keys) %>%
  filter(census_current_flag) %>%            # keep only the current census rows
  left_join(df_zillow_avg_state, by = state_keys) %>%
  select(-job_description, -year_census, -date_census_start, -date_census_end,
         -census_current_flag, -min_salary, -max_salary) %>%
  rename(mhhi = median_household_income)     # shorter name for modeling
# Model 1 (baseline): average salary ~ state population + median household
# income. Fit statistics are in the knitted summary output below.
model_data_science_1 <- lm(avg_salary ~ population + mhhi, model_data_set)
model_summary <- summary(model_data_science_1)
model_summary
##
## Call:
## lm(formula = avg_salary ~ population + mhhi, data = model_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79968 -14591 -128 13300 212080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.648e+04 1.496e+03 31.06 <2e-16 ***
## population 5.722e-04 2.751e-05 20.80 <2e-16 ***
## mhhi 6.346e-01 2.216e-02 28.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22240 on 8266 degrees of freedom
## Multiple R-squared: 0.1356, Adjusted R-squared: 0.1354
## F-statistic: 648.3 on 2 and 8266 DF, p-value: < 2.2e-16
# Model 2: add avg_size (presumably average company size -- confirm upstream)
# to the baseline predictors.
model_data_science_2 <- lm(avg_salary ~ population + mhhi + avg_size, model_data_set)
model_summary <- summary(model_data_science_2)
model_summary
##
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size, data = model_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77385 -14787 108 12923 209688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.856e+04 1.513e+03 25.48 <2e-16 ***
## population 5.992e-04 2.690e-05 22.28 <2e-16 ***
## mhhi 6.691e-01 2.170e-02 30.83 <2e-16 ***
## avg_size 1.078e+00 5.360e-02 20.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21720 on 8265 degrees of freedom
## Multiple R-squared: 0.1759, Adjusted R-squared: 0.1756
## F-statistic: 588 on 3 and 8265 DF, p-value: < 2.2e-16
# Model 3: add the job-level factor mega_job_title (Entry is the reference
# level per the Leader/Senior dummies in the output below).
model_data_science_3 <- lm(avg_salary ~ population + mhhi + avg_size + mega_job_title, model_data_set)
model_summary <- summary(model_data_science_3)
model_summary
##
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size + mega_job_title,
## data = model_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75202 -14006 681 12980 193745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.072e+04 1.448e+03 21.22 <2e-16 ***
## population 7.027e-04 2.556e-05 27.49 <2e-16 ***
## mhhi 7.187e-01 2.051e-02 35.05 <2e-16 ***
## avg_size 7.806e-01 5.136e-02 15.20 <2e-16 ***
## mega_job_titleLeader 2.236e+04 8.866e+02 25.23 <2e-16 ***
## mega_job_titleSenior 1.329e+04 5.384e+02 24.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20460 on 8263 degrees of freedom
## Multiple R-squared: 0.2685, Adjusted R-squared: 0.2681
## F-statistic: 606.7 on 5 and 8263 DF, p-value: < 2.2e-16
# Model 4: add state average rent cost to model 3's predictors.
model_data_science_4 <- lm(avg_salary ~ population + mhhi + avg_size + mega_job_title + rent_cost, model_data_set)
model_summary <- summary(model_data_science_4)
model_summary
##
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size + mega_job_title +
## rent_cost, data = model_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78107 -13716 832 12857 192633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.641e+04 1.584e+03 22.989 <2e-16 ***
## population 5.582e-04 3.043e-05 18.342 <2e-16 ***
## mhhi 5.655e-01 2.701e-02 20.938 <2e-16 ***
## avg_size 7.969e-01 5.116e-02 15.576 <2e-16 ***
## mega_job_titleLeader 2.241e+04 8.826e+02 25.394 <2e-16 ***
## mega_job_titleSenior 1.324e+04 5.360e+02 24.698 <2e-16 ***
## rent_cost 1.369e+00 1.580e-01 8.661 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20370 on 8262 degrees of freedom
## Multiple R-squared: 0.2751, Adjusted R-squared: 0.2746
## F-statistic: 522.6 on 6 and 8262 DF, p-value: < 2.2e-16
# Model 5 (final): add state_region on top of model 4's predictors
# (North Central is the reference region per the dummies in the output below).
model_data_science_5 <- lm(avg_salary ~ population + mhhi + mega_job_title + state_region + rent_cost + avg_size, model_data_set)
model_summary <- summary(model_data_science_5)
model_summary
##
## Call:
## lm(formula = avg_salary ~ population + mhhi + mega_job_title +
## state_region + rent_cost + avg_size, data = model_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74160 -13668 627 12698 198411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.587e+04 1.657e+03 21.654 < 2e-16 ***
## population 5.364e-04 3.074e-05 17.448 < 2e-16 ***
## mhhi 5.295e-01 2.910e-02 18.195 < 2e-16 ***
## mega_job_titleLeader 2.271e+04 8.797e+02 25.820 < 2e-16 ***
## mega_job_titleSenior 1.346e+04 5.346e+02 25.176 < 2e-16 ***
## state_regionNortheast -2.577e+03 7.553e+02 -3.412 0.000647 ***
## state_regionSouth 1.187e+03 5.590e+02 2.123 0.033800 *
## state_regionWest -6.822e+03 1.061e+03 -6.428 1.37e-10 ***
## rent_cost 2.253e+00 1.960e-01 11.496 < 2e-16 ***
## avg_size 8.143e-01 5.115e-02 15.918 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20280 on 8259 degrees of freedom
## Multiple R-squared: 0.282, Adjusted R-squared: 0.2812
## F-statistic: 360.5 on 9 and 8259 DF, p-value: < 2.2e-16
# Keep the coefficient table of the final model (model 5) for reporting.
model_coefficients <- model_summary$coefficients
### Root mean square of an error vector: the square root of the mean of the
### squared elements. Used below to score model predictions against actuals.
### NA in the input propagates to the result (same as the original).
RootMeanSq <- function(error) {
  squared <- error * error
  sqrt(mean(squared))
}
# Score model 5 on the full modeling data set.
# NOTE(review): abs() is redundant before squaring inside RootMeanSq, but it is
# harmless and kept for clarity of "absolute error".
model_predict <- predict(model_data_science_5, model_data_set)
model_error <- abs(model_data_set$avg_salary - model_predict)
mean(model_data_set$avg_salary)
## [1] 94260.91
RootMeanSq(model_error)
## [1] 20264.09
### Lets look at just Senior and Entry. I think the model is doing really good here.
# Same scoring restricted to Senior/Entry roles only.
model_entry_senior <- model_data_set %>% filter(mega_job_title %in% c("Senior","Entry"))
mean(model_entry_senior$avg_salary)
## [1] 93002.67
model_predict <- predict(model_data_science_5, model_entry_senior)
model_error <- abs(model_entry_senior$avg_salary - model_predict)
RootMeanSq(model_error)
## [1] 19758.7
# Setting the data set used for the model to a new dataframe
# (copied so the numeric recodings below don't touch model_data_set).
df_corr <- model_data_set
# Finding the unique values for job level and region for the correlation matrix
unique(df_corr$mega_job_title)
## [1] "Entry" "Senior" "Leader"
unique(df_corr$state_region)
## [1] "South" "West" "Northeast" "North Central"
# Encode the two categorical model inputs as integers so they can feed cor().
# Inside mutate() the columns are now referenced directly; the original used
# df_corr$col, which bypasses the data mask and is fragile (e.g. under
# grouping or if df_corr is renamed). Unlisted values map to NA.
# Job level: Entry = 0, Senior = 1, Leader = 2.
df_corr <- df_corr %>% mutate(job_title_num = case_when(
  mega_job_title == "Entry" ~ 0,
  mega_job_title == "Senior" ~ 1,
  mega_job_title == "Leader" ~ 2
))
# Region: South = 0, West = 1, Northeast = 2, North Central = 3.
df_corr <- df_corr %>% mutate(state_region_num = case_when(
  state_region == "South" ~ 0,
  state_region == "West" ~ 1,
  state_region == "Northeast" ~ 2,
  state_region == "North Central" ~ 3
))
# Keep only the numeric model columns for the correlation matrix, selected by
# NAME rather than position -- the original `df_corr[13:19]` silently breaks
# (or mislabels) if the column order of model_data_set ever changes. The order
# here matches the display labels below.
df_corr <- df_corr %>%
  select(avg_salary, avg_size, population, mhhi, rent_cost,
         job_title_num, state_region_num)
# Display names for the matrix labels, same order as the select() above.
column_names <- c("Avg Salary", "Avg Size","Population", "Median Income","Rent Cost","Job Level", "Region")
colnames(df_corr) <- column_names
# Pairwise Pearson correlations of the seven columns.
corr <- cor(df_corr)
# Plot the upper triangle, hierarchically ordered, circles sized by |r|.
ggcorrplot(corr, method = "circle", hc.order = TRUE, type = "upper") +
scale_fill_gradient2(low = "purple", high = "salmon", mid = "white", midpoint = 0,
limit = c(-1,1), space = "Lab", name="Correlation") +
labs(title = "Correlation Matrix") + theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +
theme(text = element_text(size= 20))
### We are going to remove outliers based on the histogram, which is jobs over
### 200k (DC and AK are also dropped), then build an after-tax cost-of-living
### comparison per state.
cost_living <- df_data_science_jobs %>% filter(state_abb %notin% c('DC','AK') &
avg_salary < 200000) %>%
group_by(state_abb, state_name, state_division, state_region) %>%
# n() replaces the original length(job_title) (idiomatic row count); the
# trailing ungroup() was redundant after .groups = "drop" and is removed.
summarise(avg_salary = mean(avg_salary),
num_jobs = n(),
.groups = "drop") %>%
# Match each state's average salary to its state income-tax bracket.
left_join(df_state_tax_wide) %>%
filter(max_single_bracket >= avg_salary & min_single_bracket <= avg_salary) %>%
select(state_abb, state_name, state_region, state_division, avg_salary, single_rates, num_jobs) %>%
# Cross with every federal bracket, keep the bracket containing the salary.
# NOTE(review): the federal table mixes `max_single_brackets` (plural) with
# `min_single_bracket` (singular) -- confirm those column names upstream.
crossing(df_federal_income_tax) %>%
filter(max_single_brackets >= avg_salary & min_single_bracket <= avg_salary) %>%
mutate(
gross_salary = avg_salary,
state_tax_rate = single_rates,
federal_tax_rate = rates
) %>%
left_join(df_zillow_avg_state) %>%
left_join(df_census_data) %>% filter(census_current_flag==TRUE) %>%
mutate(
# Net = gross minus combined state+federal tax, minus rent_cost (rent
# period -- monthly vs annual -- not visible here; TODO confirm).
net_salary = gross_salary * (1 - (state_tax_rate + federal_tax_rate)) - rent_cost,
chge_salary = net_salary /gross_salary -1,
above_median = gross_salary / median_household_income - 1
) %>%
select(state_abb,state_name, median_household_income,above_median,
chge_salary, gross_salary, net_salary, state_tax_rate, federal_tax_rate, rent_cost, num_jobs)
# Legend colors for the two salary series.
fill <- c("Gross Salary" = "light blue", "Net Salary" = "salmon")
# Gross vs. net salary per state, states ordered by descending gross salary.
# The net bars are drawn over the gross bars (overlaid, not stacked), so the
# visible light-blue portion of each bar is the tax + rent gap.
cost_living %>%
  ggplot(aes(x = reorder(state_abb, desc(gross_salary)))) +
  geom_col(aes(y = gross_salary, fill = "Gross Salary"), color = "black") +
  geom_col(aes(y = net_salary, fill = "Net Salary"), color = "black") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = fill) +
  labs(title = "Gross vs. Net Salary by State",
       x = "State",
       y = "Salary",
       fill = "Legend") +
  theme(axis.text.x = element_text(angle = 45),
        legend.position = "bottom",
        text = element_text(size= 20))
# How far each state's average data-science salary sits above the state's
# median household income, as a percentage. The y-axis label now says so --
# the original labeled it "Salary" even though the axis shows a percent.
cost_living %>% ggplot(aes(reorder(state_abb,desc(above_median)), above_median)) +
geom_col(fill = "#DA66AC", color = "black") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Data Science Salary Over Median Household Income",
x = "State",
y = "% Above Median Household Income") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45),
text = element_text(size= 20))
# Number of data-science job postings per state, descending.
# Bug fix: the y-axis plotted num_jobs but was labeled "Salary"
# (copy-paste from the salary plots above).
cost_living %>% ggplot(aes(reorder(state_abb,desc(num_jobs)), num_jobs)) +
geom_col(fill = "#EE828A", color = "black") +
labs(title = "# of Data Science Jobs",
x = "State",
y = "Number of Jobs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45),
text = element_text(size= 20))